# importing necessary libraries
library(tidyverse)
library(knitr)
library(readxl)
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
# Bias number printing towards fixed notation: scientific notation is only
# used when the fixed form would be more than 4 digits wider
options(scipen = 4)
Introduction
# Read the first sheet of the Excel workbook into `ltv`
# NOTE(review): the path is absolute and machine-specific; the script will only
# run where this exact location exists - confirm before sharing
ltv <- read_excel(
  path = "C:/Users/shiri/Dropbox/CMU - 2nd sem/Data Mining/Project/ltv Dataset.xlsx",
  sheet = 1
)
# Number of rows and columns that were read
dim(ltv)
## [1] 403835 9
# Column names available for analysis
names(ltv)
## [1] "id" "status" "gender" "date" "pages" "onsite"
## [7] "entered" "completed" "holiday"
# Preview the first 10 rows as a formatted table
ltv %>%
  head(10) %>%
  kable()
| 1 |
0 |
M |
2014-10-31 |
7 |
3 |
1 |
1 |
0 |
| 1 |
1 |
M |
2014-11-01 |
6 |
8 |
0 |
0 |
0 |
| 1 |
1 |
M |
2014-11-02 |
6 |
20 |
1 |
0 |
0 |
| 1 |
1 |
M |
2014-12-15 |
1 |
1 |
0 |
0 |
0 |
| 1 |
1 |
M |
2014-12-19 |
1 |
1 |
0 |
0 |
0 |
| 1 |
2 |
M |
2014-12-23 |
0 |
0 |
0 |
0 |
0 |
| 2 |
0 |
F |
2013-09-07 |
7 |
23 |
1 |
1 |
0 |
| 2 |
1 |
F |
2013-09-08 |
9 |
30 |
1 |
1 |
0 |
| 2 |
1 |
F |
2013-09-09 |
7 |
19 |
1 |
1 |
0 |
| 2 |
1 |
F |
2013-09-11 |
8 |
3 |
1 |
0 |
0 |
Exploratory Data Analysis
Describing variables
# Per-column descriptive statistics, rendered as a formatted table
ltv %>%
  summary() %>%
  kable()
|
Min. : 1 |
Min. :0.0000 |
Length:403835 |
Min. :2011-01-01 00:00:00 |
Min. : 0.000 |
Min. : 0.000 |
Min. :0.0000 |
Min. :0.0000 |
Min. :0.0000 |
|
1st Qu.: 2537 |
1st Qu.:1.0000 |
Class :character |
1st Qu.:2012-06-29 00:00:00 |
1st Qu.: 3.000 |
1st Qu.: 2.000 |
1st Qu.:1.0000 |
1st Qu.:0.0000 |
1st Qu.:0.0000 |
|
Median : 5025 |
Median :1.0000 |
Mode :character |
Median :2013-05-04 00:00:00 |
Median : 5.000 |
Median : 5.000 |
Median :1.0000 |
Median :1.0000 |
Median :0.0000 |
|
Mean : 5017 |
Mean :0.9909 |
NA |
Mean :2013-04-10 19:01:17 |
Mean : 5.018 |
Mean : 8.831 |
Mean :0.7821 |
Mean :0.5627 |
Mean :0.2277 |
|
3rd Qu.: 7495 |
3rd Qu.:1.0000 |
NA |
3rd Qu.:2014-01-26 00:00:00 |
3rd Qu.: 7.000 |
3rd Qu.: 11.000 |
3rd Qu.:1.0000 |
3rd Qu.:1.0000 |
3rd Qu.:0.0000 |
|
Max. :10000 |
Max. :2.0000 |
NA |
Max. :2014-12-31 00:00:00 |
Max. :10.000 |
Max. :220.000 |
Max. :1.0000 |
Max. :1.0000 |
Max. :1.0000 |
# Total number of missing cells across the whole table
ltv %>%
  is.na() %>%
  sum()
## [1] 0
# Recode the numeric indicator columns as factors and store `date` as a Date,
# all in a single pass over the table
ltv <- ltv %>%
  mutate(
    status = as.factor(status),
    entered = as.factor(entered),
    completed = as.factor(completed),
    holiday = as.factor(holiday),
    date = as.Date(date)
  )
# Number of distinct customers (unique ids) per gender
gender_count <- ltv %>%
  group_by(gender) %>%
  summarize(n_distinct(id))
kable(gender_count, col.names = c("Gender", "# Unique Customers"))
# Current status of every customer = status on their most recent record.
# Rows are sorted by id and then by date so that "last row per id" really is
# the latest session, whatever order the rows had in the source file
# (sorting by id alone leaves the within-customer order untouched).
current_status_df <- ltv %>%
  arrange(id, date) %>%
  group_by(id) %>%
  filter(row_number() == n()) %>%
  # drop the grouping so later verbs on this table are not silently per-id
  ungroup()
kable(table(current_status_df$status), col.names = c("Current Status", "# Unique Customers"))
Univariate Analysis
# Histograms of date, pages and onsite, each with a red vertical line at the
# variable's mean, stacked into one figure
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.6.3
# bins = 30 is the value geom_histogram() falls back to when none is given;
# stating it keeps the plots the same and silences the stat_bin() message
p1 <- ggplot(data = ltv, aes(x = date, fill = I("lightblue"))) +
  geom_histogram(bins = 30) +
  xlab("Date of login") +
  theme_bw() +
  geom_vline(aes(xintercept = mean(date)), colour = "red")
p2 <- ggplot(data = ltv, aes(x = pages, fill = I("lightblue"))) +
  geom_histogram(bins = 30) +
  xlab("# Pages visited by the user in a session") +
  theme_bw() +
  geom_vline(aes(xintercept = mean(pages)), colour = "red")
p3 <- ggplot(data = ltv, aes(x = onsite, fill = I("lightblue"))) +
  geom_histogram(bins = 30) +
  xlab("Minutes spent by the user on the website") +
  theme_bw() +
  # colour must sit outside aes(): inside it, "red" is treated as a data
  # mapping, which adds a legend and does not draw the line in red
  geom_vline(aes(xintercept = mean(onsite)), colour = "red")
grid.arrange(p1, p2, p3)

# Box plot of onsite to look for outliers, rendered as an interactive plotly
# widget
p4 <- ggplot(data = ltv, aes(y = onsite, fill = I("lightblue"))) +
  geom_boxplot() +
  # onsite is mapped to y, so the descriptive label belongs on the y axis;
  # the x axis carries no variable here and is left untitled
  labs(x = NULL, y = "Minutes spent by the user on the website") +
  theme_bw()
ggplotly(p4)
# Store log(1 + onsite) as a new column; the +1 keeps sessions with 0 minutes
# finite, since log(0) is -Inf
ltv[["log.onsite"]] <- log(ltv[["onsite"]] + 1)
# Histogram of the transformed variable with a red line at its mean, shown as
# an interactive plotly widget
p5 <- ggplot(ltv, aes(log.onsite, fill = I("lightblue"))) +
  geom_histogram() +
  geom_vline(aes(xintercept = mean(log.onsite)), colour = "red") +
  labs(x = "Log of minutes spent by the user on the website") +
  theme_bw()
ggplotly(p5)